library(survival)
library(survminer)
library(cgdsr)
library(sparklyr)
library(dplyr)
cgds <- cgdsr::CGDS("http://www.cbioportal.org/public-portal/")
#Studies<- cgdsr::getCancerStudies(cgds)
clinicalData <- cgdsr::getClinicalData(cgds, "gbm_tcga_pub_all")
#clinicalData <- read.csv("Clinical_tab.csv") #, na.strings=c("","NA")
clinicalData[c('DFS_MONTHS','DFS_STATUS', 'OS_MONTHS', 'OS_STATUS', 'TREATMENT_STATUS' )]
clinicalData$OS_STATUS <- gsub("LIVING", "0", clinicalData$OS_STATUS, ignore.case = TRUE)
clinicalData$OS_STATUS <- gsub("DECEASED", "1", clinicalData$OS_STATUS, ignore.case = TRUE)
clinicalData$DFS_STATUS <- gsub("^$|^ $", "DiseaseFree", clinicalData$DFS_STATUS, ignore.case = TRUE)
clinicalData$OS_STATUS <- as.numeric(clinicalData$OS_STATUS)
fit <- survival::survfit(Surv(OS_MONTHS, OS_STATUS) ~ DFS_STATUS, data = clinicalData)
survminer::ggsurvplot(fit, data = clinicalData,
type = "kaplan-meier",
#conf.type="log",
conf.int = TRUE,
pval = TRUE,
fun = "pct",
risk.table = TRUE,
size = 1,
linetype = "strata",
palette = c("#E7B800", "#2E9FDF"),
legend = "top",
lengend.title = "DFS_STATUS",
legend.labs = c("DiseaseFree", "Recurred")
)
clinicalData <- cgdsr::getClinicalData(cgds, "gbm_tcga_pub_all")
start_time <- Sys.time()
clinicalData %>%
mutate(OS_STATUS = gsub("LIVING", "0", OS_STATUS)) %>%
mutate(OS_STATUS = gsub( "DECEASED", "1", OS_STATUS)) %>%
mutate(DFS_STATUS = gsub( "^$|^ $", "DiseaseFree", DFS_STATUS)) %>%
mutate(OS_STATUS = as.numeric(OS_STATUS)) %>%
arrange(OS_MONTHS) %>%
mutate( DiseaseFree = ifelse(DFS_STATUS == "DiseaseFree", 1, 0)) %>%
as.data.frame() %>%
mutate(n_DiseaseFree = cumsum(DiseaseFree == 1)) %>%
mutate(n_Recurred = cumsum(DiseaseFree == 0)) %>%
ggplot(aes(x = OS_MONTHS, y = value, color = variable)) +
geom_point(aes(y = n_DiseaseFree, col = "n_DiseaseFree")) +
geom_point(aes(y = n_Recurred, col = "n_Recurred")) +
labs(title = paste("Using R Session, Running time = ", Sys.time() - start_time))
clinicalData <- cgdsr::getClinicalData(cgds, "gbm_tcga_pub_all")
sc <- spark_connect(master = "local",
version = "2.4.0")
Re-using existing Spark connection to local
clinicalData_tbl <- dplyr::copy_to(sc, clinicalData, overwrite = TRUE)
start_time <- Sys.time()
clinicalData_tbl %>%
mutate(OS_STATUS = regexp_replace(OS_STATUS, "LIVING", "0")) %>%
mutate(OS_STATUS = regexp_replace(OS_STATUS, "DECEASED", "1")) %>%
mutate(DFS_STATUS = regexp_replace(DFS_STATUS, "^$|^ $", "DiseaseFree")) %>%
mutate(OS_STATUS = as.numeric(OS_STATUS)) %>%
#mutate(OS_STATUS = regexp_replace(as.numeric(OS_STATUS), 'NaN', NA)) %>%
#mutate(OS_STATUS = regexp_replace(OS_STATUS, NaN, NA)) %>%
#na.replace('') %>% ## not good for OS_STATUS (0,1)
#dplyr::filter(!is.na(OS_MONTHS))
arrange(is.na(OS_MONTHS), OS_MONTHS) %>% ## OUFFF put Nan at the end of the column
mutate(DiseaseFree = ifelse(DFS_STATUS == "DiseaseFree", 1, 0)) %>%
as.data.frame() %>%
mutate( n_DiseaseFree = cumsum(as.numeric(DiseaseFree == 1 ))) %>%
mutate( n_Recurred = cumsum(as.numeric(DiseaseFree == 0 ))) %>%
ggplot(aes(x = OS_MONTHS, y = value, color = variable)) +
geom_point(aes(y = n_DiseaseFree, col = "n_DiseaseFree")) +
geom_point(aes(y = n_Recurred, col = "n_Recurred")) +
labs(title = paste("Using Spark Node, Running time = ", Sys.time() - start_time))
ml_aft_survival_regressionsurvival packagelibrary(survival)
ovarian
# MAGIC - **futime**: survival or censoring time
# MAGIC - **fustat**: censoring status
# MAGIC - **age**: in years
# MAGIC - **resid_ds**: residual disease present (1=no, 2=yes)
# MAGIC - **rx**: treatment group
# MAGIC - **ecog.ps**: ECOG performance status (1 is better, see reference)
sc <- spark_connect(master = "local",
version = "2.4.0")
Re-using existing Spark connection to local
ovarian_tbl <- sdf_copy_to(sc, ovarian, name = "ovarian_tbl", overwrite = TRUE)
#spark.survreg(ovarian_tbl, Surv(futime, fustat) ~ ecog_ps + rx)
partitions <- ovarian_tbl %>%
sdf_partition(training = 0.7, test = 0.3, seed = 1111)
ovarian_training <- partitions$training
ovarian_test <- partitions$test
sur_reg <- ovarian_training %>%
ml_aft_survival_regression(futime ~ ecog_ps + rx + age + resid_ds, censor_col = "fustat")
pred <- ml_predict(sur_reg, ovarian_test)
pred
intercept <- sur_reg$coefficients[1]
coefficients <- sur_reg$coefficients[c(2,3)]
sur_reg$coefficients
(Intercept) ecog_ps rx age resid_ds
11.69793764 -0.27694569 0.65752740 -0.07862066 -0.79872945
plotParams <- round(ovarian[c('resid.ds', 'rx', 'ecog.ps', 'age')])
scale <- exp(intercept + as.matrix(plotParams) * coefficients)
cbind(plotParams, scale)
tSeq <- ovarian$futime # seq(0, 5E3, 50)
probs <- data.frame(t = tSeq)
for (i in 1:2^4) {
probs[, paste("(resid.ds, rx, ecog.ps, age) = (", toString(plotParams[i, ]), ")", sep = "")] <-
pweibull(tSeq, shape = 1, scale = scale[i], lower.tail = F)
}
probs
# MAGIC - **futime**: survival or censoring time
# MAGIC - **fustat**: censoring status
# MAGIC - **age**: in years
# MAGIC - **resid_ds**: residual disease present (1=no, 2=yes)
# MAGIC - **rx**: treatment group
# MAGIC - **ecog.ps**: ECOG performance status (1 is better, see reference)
melted <- melt(probs, id.vars="t", variable.name="group", value.name="prob")
melted
library(ggplot2)
library(reshape2)
ggplot(data=melted, aes(x=t, y=prob, group=group, color=group)) +
geom_point() +
#geom_smooth() +
#geom_jitter() +
labs(x = "time", y = "Survival probability")
gbm_tcga_pub_all Study from cBioPortal# > ovarian_training
# # Source: spark<?> [?? x 6]
# futime fustat age resid_ds rx ecog_ps
# * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 156 1 66.5 2 1 2
# 2 329 1 43.1 2 1 1
# 3 353 1 63.2 1 2 2
# 4 365 1 64.4 2 2 1
# 5 377 0 58.3 1 2 1
# 6 421 0 53.4 2 2 1
# 7 448 0 56.4 1 1 2
# 8 464 1 56.9 2 2 2
# 9 475 1 59.9 2 2 2
# 10 563 1 55.2 1 2 2
# # ... with more rows
clinicalData <- cgdsr::getClinicalData(cgds, "gbm_tcga_pub_all")
#clinicalData <- read.csv("Clinical_tab.csv") #, na.strings=c("","NA")
clinicalData <- clinicalData[c('OS_MONTHS', 'OS_STATUS', 'DFS_STATUS', 'TREATMENT_STATUS' )]
sc <- spark_connect(master = "local",
version = "2.4.0")
Re-using existing Spark connection to local
clinicalData_tbl <- dplyr::copy_to(sc, clinicalData, overwrite = TRUE)
start_time <- Sys.time()
clinicalData_trans_tbl <-
clinicalData_tbl %>%
mutate(OS_STATUS = regexp_replace(OS_STATUS, "LIVING", 1)) %>%
mutate(OS_STATUS = regexp_replace(OS_STATUS, "DECEASED", 0)) %>%
mutate(DFS_STATUS = regexp_replace(DFS_STATUS, "^$|^ $", "DiseaseFree")) %>%
mutate(DFS_STATUS = regexp_replace(DFS_STATUS, "DiseaseFree", 1)) %>%
mutate(DFS_STATUS = regexp_replace(DFS_STATUS, "Recurred", 2)) %>%
mutate(xr = ifelse(TREATMENT_STATUS == "Untreated", 1 , 2)) %>%
mutate(xr = ifelse(TREATMENT_STATUS == "Treated", 2, 1)) %>%
mutate(OS_STATUS = as.numeric(OS_STATUS)) %>%
mutate(DFS_STATUS = as.numeric(DFS_STATUS)) %>%
#arrange(is.na(OS_MONTHS), OS_MONTHS) %>% ## OUFFF put Nan at the end of the column
na.replace(1)
clinicalData_trans <- as.data.frame(clinicalData_trans_tbl)
clinicalData_trans_tbl
partitions_clinicalData <- clinicalData_trans_tbl %>%
sdf_partition(training = 0.7, test = 0.3, seed = 1111)
clinicalData_training <- partitions_clinicalData$training
clinicalData_test <- partitions_clinicalData$test
sur_reg_clinicalData <- clinicalData_training %>%
ml_aft_survival_regression(OS_MONTHS ~ DFS_STATUS + xr, censor_col = "OS_STATUS")
pred_clinicalData <- ml_predict(sur_reg_clinicalData, clinicalData_test)
pred_clinicalData
intercept_clinicalData <- sur_reg_clinicalData$coefficients[1]
coefficients_clinicalData <- sur_reg$coefficients[c(2,3)]
sur_reg_clinicalData$coefficients
(Intercept) DFS_STATUS xr
-2.316624 5.763584 -1.634578
plotParams_clinicalData <- clinicalData_trans[c('DFS_STATUS', 'xr')]
scale_clinicalData <- exp(intercept_clinicalData + as.matrix(plotParams_clinicalData) * coefficients_clinicalData)
cbind(plotParams_clinicalData, scale_clinicalData)
tSeq_clinicalData <- clinicalData_trans$OS_MONTHS # seq(0, 5E3, 50)
probs_clinicalData <- data.frame(t = tSeq_clinicalData)
for (i in 1:5) {
probs_clinicalData[, paste("(DFS_STATUS, xr) = (", toString(plotParams_clinicalData[i, ]), ")", sep = "")] <-
pweibull(tSeq_clinicalData, shape = 1, scale = scale_clinicalData[i], lower.tail = F)
}
probs_clinicalData
melted_clinicalData <- melt(probs_clinicalData, id.vars="t", variable.name="group", value.name="prob")
melted_clinicalData
library(ggplot2)
library(reshape2)
ggplot(data=melted_clinicalData, aes(x=t, y=prob, group=group, color=group)) +
geom_point() +
#geom_smooth() +
#geom_jitter() +
labs(x = "time", y = "Survival probability")